"""Hotel booking cancellation analysis and model comparison.

Loads the hotel bookings CSV, cleans it, plots guest origin, room prices,
guest counts and stay lengths, then trains several classifiers and a small
dense neural network to predict ``is_canceled`` and compares their test
accuracy.

This file was exported from a notebook; ``# %%`` markers show the approximate
cell boundaries. Bare expressions at the end of a cell (``df.head()`` etc.)
only render when the cell is run interactively.

Notebook-only setup that is not valid in a plain ``.py`` file:
    %matplotlib inline
    !pip install sort-dataframeby-monthorweek
    !pip install sorted-months-weekdays
"""

# %%
# importing libraries
import warnings

import folium
import keras
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import sort_dataframeby_monthorweek as sd
from catboost import CatBoostClassifier
from folium.plugins import HeatMap
from keras.layers import Dense
from keras.models import Sequential
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.utils import to_categorical
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 32)

# Location of the input CSV; edit this to point at a local copy of the data.
DATA_PATH = "C:/Users/VANAM GANESH/Downloads/hotel_bookings.csv/hotel_bookings.csv"
TEST_SIZE = 0.30
# Fixed seed so both train/test splits (and hence the comparison) are repeatable.
RANDOM_STATE = 42
EPOCHS = 100


def sort_month(df, column_name):
    """Return ``sd.Sort_Dataframeby_Month(df, column_name)``.

    Thin wrapper around the third-party helper; it is expected to order the
    rows by the month names in *column_name* (see that package's docs).
    """
    return sd.Sort_Dataframeby_Month(df, column_name)


def evaluate_classifier(name, model, X_train, X_test, y_train, y_test):
    """Fit *model*, print its test metrics and return the test accuracy.

    Args:
        name: Label used in the printed accuracy line.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, X_test: Feature matrices.
        y_train, y_test: Target vectors.

    Returns:
        Accuracy of *model* on ``(X_test, y_test)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score of {name} is : {acc}")
    print(f"Confusion Matrix : \n{confusion_matrix(y_test, y_pred)}")
    print(f"Classification Report : \n{classification_report(y_test, y_pred)}")
    return acc


# %%
# reading data
df = pd.read_csv(DATA_PATH)
df.head()

# %%
df.describe()

# %%
df.info()

# %%
# checking for null values
null_counts = df.isna().sum()
null = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Null Values': null_counts / df.shape[0] * 100,
})
null

# %%
# filling null values with zero
df.fillna(0, inplace=True)

# %%
# visualizing per-column completeness (runs after the fill above, so every
# column should now show as complete)
msno.bar(df)
plt.show()

# %%
# adults, babies and children can't all be zero at the same time,
# so rows where all three are zero are dropped
no_guests = (df.children == 0) & (df.adults == 0) & (df.babies == 0)
df[no_guests]

# %%
# .copy() makes df an independent frame so later in-place edits are safe
df = df[~no_guests].copy()
df

# %%
# number of non-cancelled bookings per country
country_wise_guests = df[df['is_canceled'] == 0]['country'].value_counts().reset_index()
country_wise_guests.columns = ['country', 'No of guests']
country_wise_guests

# %%
guests_map = px.choropleth(
    country_wise_guests,
    locations='country',
    color='No of guests',
    hover_name='country',
)
guests_map.show()

# %%
df.head()

# %%
# non-cancelled bookings only; copied because a column is added to it below
data = df[df['is_canceled'] == 0].copy()
px.box(
    data_frame=data,
    x='reserved_room_type',
    y='adr',
    color='hotel',
    template='plotly_dark',
).show()

# %%
data_resort = data[data['hotel'] == 'Resort Hotel']
data_city = data[data['hotel'] == 'City Hotel']

# %%
# mean adr per arrival month, per hotel type
resort_hotel = data_resort.groupby('arrival_date_month')['adr'].mean().reset_index()
resort_hotel

# %%
city_hotel = data_city.groupby('arrival_date_month')['adr'].mean().reset_index()
city_hotel

# %%
final_hotel = resort_hotel.merge(city_hotel, on='arrival_date_month')
final_hotel.columns = ['month', 'price_for_resort', 'price_for_city_hotel']
final_hotel

# %%
final_prices = sort_month(final_hotel, 'month')
final_prices

# %%
px.line(
    final_prices,
    x='month',
    y=['price_for_resort', 'price_for_city_hotel'],
    title='Room price per night over the Months',
    template='plotly_dark',
).show()

# %%
# number of non-cancelled bookings per arrival month, per hotel type
resort_guests = data_resort['arrival_date_month'].value_counts().reset_index()
resort_guests.columns = ['month', 'no of guests']
resort_guests

# %%
city_guests = data_city['arrival_date_month'].value_counts().reset_index()
city_guests.columns = ['month', 'no of guests']
city_guests

# %%
final_guests = resort_guests.merge(city_guests, on='month')
final_guests.columns = ['month', 'no of guests in resort', 'no of guest in city hotel']
final_guests

# %%
final_guests = sort_month(final_guests, 'month')
final_guests

# %%
px.line(
    final_guests,
    x='month',
    y=['no of guests in resort', 'no of guest in city hotel'],
    title='Total no of guests per Months',
    template='plotly_dark',
).show()

# %%
data.head()

# %%
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
data.head()

# %%
# bookings per (total_nights, hotel); the count column is selected by name
# instead of relying on the column order of the frame
stay = (
    data.groupby(['total_nights', 'hotel'])['is_canceled']
    .count()
    .reset_index(name='Number of stays')
)
stay

# %%
px.bar(
    data_frame=stay,
    x='total_nights',
    y='Number of stays',
    color='hotel',
    barmode='group',
    template='plotly_dark',
).show()

# %%
# Correlation is restricted to numeric columns: DataFrame.corr() raises on
# string columns in pandas >= 2.0. Computed once and reused below.
plt.figure(figsize=(24, 12))
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, linewidths=1)
plt.show()

# %%
correlation = corr['is_canceled'].abs().sort_values(ascending=False)
correlation

# %%
# dropping columns that are not useful
useless_col = [
    'days_in_waiting_list',
    'arrival_date_year',
    'assigned_room_type',
    'booking_changes',
    'reservation_status',
    'country',
]
df = df.drop(columns=useless_col)

# %%
df.head()

# %%
# creating numerical and categorical dataframes
cat_cols = [col for col in df.columns if df[col].dtype == 'O']
cat_cols

# %%
# independent copy so the column edits below do not touch df
cat_df = df[cat_cols].copy()
cat_df.head()

# %%
# split the status date into year / month / day features
status_date = pd.to_datetime(cat_df['reservation_status_date'])
cat_df['year'] = status_date.dt.year
cat_df['month'] = status_date.dt.month
cat_df['day'] = status_date.dt.day

# %%
cat_df = cat_df.drop(columns=['reservation_status_date', 'arrival_date_month'])

# %%
cat_df.head()

# %%
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

# %%
# encoding categorical variables
# NOTE: the integer codes are arbitrary labels; values missing from a mapping
# become NaN. 'Non Refund' is coded 3 (2 is unused), as in the original.
encodings = {
    'hotel': {'Resort Hotel': 0, 'City Hotel': 1},
    'meal': {'BB': 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4},
    'market_segment': {
        'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
        'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7,
    },
    'distribution_channel': {
        'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3, 'GDS': 4,
    },
    'reserved_room_type': {
        'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6, 'L': 7, 'B': 8,
    },
    'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3},
    'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3},
    'year': {2015: 0, 2014: 1, 2016: 2, 2017: 3},
}
for col, mapping in encodings.items():
    cat_df[col] = cat_df[col].map(mapping)

# %%
cat_df.head()

# %%
# numeric features: everything that is neither categorical nor the target
num_df = df.drop(columns=cat_cols + ['is_canceled'])
num_df

# %%
num_df.var()

# %%
# normalizing numerical variables with log(1 + x)
log_cols = [
    'lead_time',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'agent',
    'company',
    'adr',
]
num_df[log_cols] = np.log1p(num_df[log_cols])

# %%
num_df.var()

# %%
# log1p yields NaN for inputs below -1; any such adr entries get the column mean
num_df['adr'] = num_df['adr'].fillna(value=num_df['adr'].mean())

# %%
num_df.head()

# %%
X = pd.concat([cat_df, num_df], axis=1)
y = df['is_canceled']

# %%
X.shape, y.shape

# %%
# splitting data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# %%
X_train.head()

# %%
X_test.head()

# %%
y_train.head(), y_test.head()

# %%
# The original converted the features to contiguous arrays just before KNN;
# doing it once here gives every model the same input representation.
X_train = np.ascontiguousarray(X_train)
X_test = np.ascontiguousarray(X_test)

# %%
lr = LogisticRegression()
acc_lr = evaluate_classifier('Logistic Regression', lr, X_train, X_test, y_train, y_test)

# %%
knn = KNeighborsClassifier()
acc_knn = evaluate_classifier('KNN', knn, X_train, X_test, y_train, y_test)

# %%
dtc = DecisionTreeClassifier()
acc_dtc = evaluate_classifier('Decision Tree', dtc, X_train, X_test, y_train, y_test)

# %%
rd_clf = RandomForestClassifier()
acc_rd_clf = evaluate_classifier('Random Forest', rd_clf, X_train, X_test, y_train, y_test)

# %%
# The base learner is passed positionally: the keyword was renamed from
# ``base_estimator`` to ``estimator`` across scikit-learn releases.
ada = AdaBoostClassifier(dtc)
acc_ada = evaluate_classifier('Ada Boost Classifier', ada, X_train, X_test, y_train, y_test)

# %%
gb = GradientBoostingClassifier()
acc_gb = evaluate_classifier(
    'Gradient Boosting Classifier', gb, X_train, X_test, y_train, y_test
)

# %%
xgb = XGBClassifier(booster='gbtree', learning_rate=0.1, max_depth=5, n_estimators=180)
acc_xgb = evaluate_classifier('XgBoost', xgb, X_train, X_test, y_train, y_test)

# %%
cat = CatBoostClassifier(iterations=100)
acc_cat = evaluate_classifier('Cat Boost', cat, X_train, X_test, y_train, y_test)

# %%
etc = ExtraTreesClassifier()
acc_etc = evaluate_classifier('Extra Trees Classifier', etc, X_train, X_test, y_train, y_test)

# %%
lgbm = LGBMClassifier(learning_rate=1)
acc_lgbm = evaluate_classifier('LGBM', lgbm, X_train, X_test, y_train, y_test)

# %%
# voting ensemble over all of the models above
classifiers = [
    ('Gradient Boosting Classifier', gb),
    ('Cat Boost Classifier', cat),
    ('XGboost', xgb),
    ('Decision Tree', dtc),
    ('Extra Tree', etc),
    ('Light Gradient', lgbm),
    ('Random Forest', rd_clf),
    ('Ada Boost', ada),
    ('Logistic', lr),
    ('Knn', knn),
]
vc = VotingClassifier(estimators=classifiers)
acc_vtc = evaluate_classifier('Voting Classifier', vc, X_train, X_test, y_train, y_test)

# %%
# ANN: same features, one-hot encoded target
X = pd.concat([cat_df, num_df], axis=1)
y = to_categorical(df['is_canceled'])

# %%
# splitting data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# %%
# One-hot targets pair with a softmax output and categorical cross-entropy, so
# the reported 'accuracy' is per-sample class accuracy and is comparable to
# accuracy_score used for the other models. Layer sizes are taken from the
# data instead of being hard-coded.
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(100, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_history = model.fit(
    X_train, y_train, validation_data=(X_test, y_test), epochs=EPOCHS
)

# %%
train_loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
# epoch axis is derived from the recorded history, not a fixed 1..100
epoch = list(range(1, len(train_loss) + 1))
loss = pd.DataFrame({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss})
px.line(
    data_frame=loss,
    x='epoch',
    y=['val_loss', 'train_loss'],
    title='Training and Validation Loss',
    template='plotly_dark',
).show()

# %%
train_acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy']
accuracy = pd.DataFrame({'epoch': epoch, 'train_acc': train_acc, 'val_acc': val_acc})
px.line(
    data_frame=accuracy,
    x='epoch',
    y=['val_acc', 'train_acc'],
    title='Training and Validation Accuracy',
    template='plotly_dark',
).show()

# %%
# evaluate() returns [loss, accuracy] for the metrics compiled above
acc_ann = model.evaluate(X_test, y_test)[1]
print(f'Accuracy of model is {acc_ann}')

# %%
# comparison table of test accuracies
models = pd.DataFrame({
    'Model': [
        'Logistic Regression', 'KNN', 'Decision Tree Classifier',
        'Random Forest Classifier', 'Ada Boost Classifier',
        'Gradient Boosting Classifier', 'XgBoost', 'Cat Boost',
        'Extra Trees Classifier', 'LGBM', 'Voting Classifier', 'ANN',
    ],
    'Score': [
        acc_lr, acc_knn, acc_dtc, acc_rd_clf, acc_ada, acc_gb,
        acc_xgb, acc_cat, acc_etc, acc_lgbm, acc_vtc, acc_ann,
    ],
})
models.sort_values(by='Score', ascending=False)

# %%
px.bar(
    data_frame=models,
    x='Score',
    y='Model',
    color='Score',
    template='plotly_dark',
    title='Models Comparison',
).show()